This notebook shows how to optimise a TensorFlow exported SavedModel by shrinking its size (to reduce its memory and disk footprint) and improving prediction latency. This is accomplished by applying graph transformations such as removing identity and unused nodes, folding constants and batch norms, and (optionally) quantising the weights.
The optimisation operations we apply in this example come from the TensorFlow Graph Transform Tool, which is a C++ command-line tool. We use its Python APIs to call the underlying C++ libraries.
The Graph Transform Tool is designed to work on models saved as GraphDef files, usually in binary protobuf format. However, the model exported after training an Estimator is in the SavedModel format (a saved_model.pb file plus a variables folder containing variables.data-* and variables.index files).
We need to optimise the model while keeping it in the SavedModel format. Thus, the optimisation steps will be:
1. Freeze the SavedModel into a single GraphDef file (freezed_model.pb), converting the model variables into constants.
2. Optimise the frozen graph with the Graph Transform Tool, producing optimised_model.pb.
3. Convert the optimised GraphDef back into a SavedModel so that it can be served (for example, on Cloud ML Engine).
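The Graph Transform Tool is reached from Python through tensorflow.tools.graph_transforms.TransformGraph, which is the call used later in this notebook. Below is a minimal sketch of that entry point; the helper name apply_graph_transforms is illustrative only, and the concrete output node name and transform list for this model are defined further down.

from tensorflow.tools.graph_transforms import TransformGraph

def apply_graph_transforms(frozen_graph_def, output_node_names, transforms):
    # Feed (input) node names are left empty for a frozen inference graph;
    # anything not needed to compute the listed outputs can then be stripped.
    return TransformGraph(frozen_graph_def, [], output_node_names, transforms)

Note that the tool does not modify the graph in place: it returns a new GraphDef, which we then write out and re-wrap as a SavedModel.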
In [1]:
import os
import numpy as np
from datetime import datetime
import tensorflow as tf
print "TensorFlow : {}".format(tf.__version__)
In [2]:
mnist = tf.contrib.learn.datasets.load_dataset("mnist")
train_data = mnist.train.images
train_labels = np.asarray(mnist.train.labels, dtype=np.int32)
eval_data = mnist.test.images
eval_labels = np.asarray(mnist.test.labels, dtype=np.int32)
NUM_CLASSES = 10
In [3]:
print "Train data shape: {}".format(train_data.shape)
print "Eval data shape: {}".format(eval_data.shape)
In [4]:
def model_fn(features, labels, mode, params):
    is_training = True if mode == tf.estimator.ModeKeys.TRAIN else False

    # convolution layers
    def _cnn_layers(conv_inputs):
        for i in range(params.num_conv_layers):
            filters = params.init_filters * (2**i)
            conv = tf.keras.layers.Conv2D(kernel_size=3, filters=filters, strides=1, padding='SAME')(conv_inputs)
            pool = tf.keras.layers.MaxPool2D(pool_size=2, strides=2, padding='SAME')(conv)
            batch_norm = tf.keras.layers.BatchNormalization()(pool, training=is_training)
            conv_inputs = batch_norm
        outputs = conv_inputs
        return outputs

    # fully-connected layers
    def _fully_connected_layers(dense_inputs):
        for i in range(len(params.hidden_units)):
            dense = tf.keras.layers.Dense(params.hidden_units[i], activation='relu')(dense_inputs)
            dense_dropout = tf.keras.layers.Dropout(params.dropout)(dense, training=is_training)
            dense_inputs = dense_dropout
        outputs = dense_inputs
        return outputs

    # model body
    def _inference(features, mode, params):
        input_layer = tf.keras.layers.Reshape([28, 28, 1])(features["input_image"])
        convolutions = _cnn_layers(input_layer)
        flatten = tf.keras.layers.Flatten()(convolutions)
        fully_connected = _fully_connected_layers(flatten)

        # deliberately unused layer: it adds nodes that the optimisation step should strip out
        unused_layers = tf.keras.layers.Dense(units=100, name='unused', activation=None)(flatten)

        logits = tf.keras.layers.Dense(units=NUM_CLASSES, name='logits', activation=None)(fully_connected)
        return logits

    # model head
    head = tf.contrib.estimator.multi_class_head(n_classes=NUM_CLASSES)

    return head.create_estimator_spec(
        features=features,
        mode=mode,
        logits=_inference(features, mode, params),
        labels=labels,
        optimizer=tf.train.AdamOptimizer(params.learning_rate)
    )
In [5]:
def create_estimator(params, run_config):

    # evaluation metric_fn
    def _metric_fn(labels, predictions):
        metrics = {}
        pred_class = predictions['class_ids']
        metrics['micro_accuracy'] = tf.metrics.mean_per_class_accuracy(
            labels=labels, predictions=pred_class, num_classes=NUM_CLASSES
        )
        return metrics

    mnist_classifier = tf.estimator.Estimator(
        model_fn=model_fn, params=params, config=run_config)

    mnist_classifier = tf.contrib.estimator.add_metrics(
        estimator=mnist_classifier, metric_fn=_metric_fn)

    return mnist_classifier
In [6]:
def run_experiment(hparams, run_config):

    train_spec = tf.estimator.TrainSpec(
        input_fn=tf.estimator.inputs.numpy_input_fn(
            x={"input_image": train_data},
            y=train_labels,
            batch_size=hparams.batch_size,
            num_epochs=None,
            shuffle=True),
        max_steps=hparams.max_training_steps
    )

    eval_spec = tf.estimator.EvalSpec(
        input_fn=tf.estimator.inputs.numpy_input_fn(
            x={"input_image": eval_data},
            y=eval_labels,
            batch_size=hparams.batch_size,
            num_epochs=1,
            shuffle=False),
        steps=None,
        throttle_secs=hparams.eval_throttle_secs
    )

    tf.logging.set_verbosity(tf.logging.INFO)

    time_start = datetime.utcnow()
    print("Experiment started at {}".format(time_start.strftime("%H:%M:%S")))
    print(".......................................")

    estimator = create_estimator(hparams, run_config)

    tf.estimator.train_and_evaluate(
        estimator=estimator,
        train_spec=train_spec,
        eval_spec=eval_spec
    )

    time_end = datetime.utcnow()
    print(".......................................")
    print("Experiment finished at {}".format(time_end.strftime("%H:%M:%S")))
    print("")

    time_elapsed = time_end - time_start
    print("Experiment elapsed time: {} seconds".format(time_elapsed.total_seconds()))

    return estimator
In [9]:
MODELS_LOCATION = 'models/mnist'
MODEL_NAME = 'cnn_classifier'
model_dir = os.path.join(MODELS_LOCATION, MODEL_NAME)
print(model_dir)
hparams = tf.contrib.training.HParams(
    batch_size=100,
    hidden_units=[512, 512],
    num_conv_layers=3,
    init_filters=64,
    dropout=0.2,
    max_training_steps=50,
    eval_throttle_secs=10,
    learning_rate=1e-3
)
run_config = tf.estimator.RunConfig(
    tf_random_seed=19830610,
    save_checkpoints_steps=1000,
    keep_checkpoint_max=3,
    model_dir=model_dir
)
In [10]:
if tf.gfile.Exists(model_dir):
    print("Removing previous artifacts...")
    tf.gfile.DeleteRecursively(model_dir)

estimator = run_experiment(hparams, run_config)
In [11]:
def make_serving_input_receiver_fn():
    inputs = {'input_image': tf.placeholder(shape=[None, 784], dtype=tf.float32, name='input_image')}
    return tf.estimator.export.build_raw_serving_input_receiver_fn(inputs)

export_dir = os.path.join(model_dir, 'export')

if tf.gfile.Exists(export_dir):
    tf.gfile.DeleteRecursively(export_dir)

estimator.export_savedmodel(
    export_dir_base=export_dir,
    serving_input_receiver_fn=make_serving_input_receiver_fn()
)
Out[11]:
In [12]:
%%bash
saved_models_base=models/mnist/cnn_classifier/export/
saved_model_dir=${saved_models_base}$(ls ${saved_models_base} | tail -n 1)
echo ${saved_model_dir}
ls ${saved_model_dir}
saved_model_cli show --dir=${saved_model_dir} --all
In [13]:
def inference_test(saved_model_dir, signature="predict", input_name='input_image', batch=300, repeat=100):

    tf.logging.set_verbosity(tf.logging.ERROR)

    time_start = datetime.utcnow()
    predictor = tf.contrib.predictor.from_saved_model(
        export_dir=saved_model_dir,
        signature_def_key=signature
    )
    time_end = datetime.utcnow()

    time_elapsed = time_end - time_start
    print("")
    print("Model loading time: {} seconds".format(time_elapsed.total_seconds()))
    print("")

    time_start = datetime.utcnow()
    output = None
    for i in range(repeat):
        output = predictor(
            {input_name: eval_data[:batch]}
        )
    time_end = datetime.utcnow()

    time_elapsed_sec = (time_end - time_start).total_seconds()

    print("Inference elapsed time: {} seconds".format(time_elapsed_sec))
    print("")
    print("Prediction produced for {} instances batch, repeated {} times".format(len(output['class_ids']), repeat))
    print("Average latency per batch: {} seconds".format(time_elapsed_sec / repeat))
    print("")
    print("Prediction output for the first instance in the batch:")
    for key in output.keys():
        print("{}: {}".format(key, output[key][0]))
In [14]:
saved_model_dir = os.path.join(export_dir, os.listdir(export_dir)[-1])
print(saved_model_dir)
inference_test(saved_model_dir)
In [15]:
def describe_graph(graph_def, show_nodes=False):
    print('Input Feature Nodes: {}'.format([node.name for node in graph_def.node if node.op == 'Placeholder']))
    print("")
    print('Unused Nodes: {}'.format([node.name for node in graph_def.node if 'unused' in node.name]))
    print("")
    print('Output Nodes: {}'.format([node.name for node in graph_def.node if 'predictions' in node.name]))
    print("")
    print('Quantization Nodes: {}'.format([node.name for node in graph_def.node if 'quant' in node.name]))
    print("")
    print('Constant Count: {}'.format(len([node for node in graph_def.node if node.op == 'Const'])))
    print("")
    print('Variable Count: {}'.format(len([node for node in graph_def.node if 'Variable' in node.op])))
    print("")
    print('Identity Count: {}'.format(len([node for node in graph_def.node if node.op == 'Identity'])))
    print("")
    print('Total nodes: {}'.format(len(graph_def.node)))
    print('')

    if show_nodes:
        for node in graph_def.node:
            print('Op:{} - Name: {}'.format(node.op, node.name))
In [16]:
def get_graph_def_from_saved_model(saved_model_dir):
    print(saved_model_dir)
    print("")

    from tensorflow.python.saved_model import tag_constants

    with tf.Session() as session:
        meta_graph_def = tf.saved_model.loader.load(
            session,
            tags=[tag_constants.SERVING],
            export_dir=saved_model_dir
        )
    return meta_graph_def.graph_def
In [17]:
describe_graph(get_graph_def_from_saved_model(saved_model_dir))
In [18]:
def get_size(model_dir):
    print(model_dir)
    print("")

    pb_size = os.path.getsize(os.path.join(model_dir, 'saved_model.pb'))

    variables_size = 0
    if os.path.exists(os.path.join(model_dir, 'variables/variables.data-00000-of-00001')):
        variables_size = os.path.getsize(os.path.join(model_dir, 'variables/variables.data-00000-of-00001'))
        variables_size += os.path.getsize(os.path.join(model_dir, 'variables/variables.index'))

    print("Model size: {} KB".format(round(pb_size / 1024.0, 3)))
    print("Variables size: {} KB".format(round(variables_size / 1024.0, 3)))
    print("Total Size: {} KB".format(round((pb_size + variables_size) / 1024.0, 3)))
In [19]:
get_size(saved_model_dir)
This function converts the SavedModel into a single GraphDef file (freezed_model.pb), storing the variables as constants inside freezed_model.pb.
You need to define the graph output nodes for freezing. We are only interested in class_ids, which is produced by the head/predictions/ExpandDims node.
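If you are not sure which node produces a given output, you can look up the tensor name from the exported signature instead of inspecting the graph by hand. The sketch below is illustrative (the helper name get_output_node_name is not part of this notebook), assuming the 'predict' signature and 'class_ids' output exported above:

from tensorflow.python.saved_model import tag_constants

def get_output_node_name(saved_model_dir, signature_key='predict', output_key='class_ids'):
    # Load the SavedModel, read the tensor behind the requested signature output,
    # and strip the ':0' suffix to get the node name expected by freeze_graph.
    with tf.Session(graph=tf.Graph()) as session:
        meta_graph_def = tf.saved_model.loader.load(
            session, tags=[tag_constants.SERVING], export_dir=saved_model_dir)
    tensor_name = meta_graph_def.signature_def[signature_key].outputs[output_key].name
    return tensor_name.split(':')[0]  # e.g. 'head/predictions/ExpandDims'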
In [20]:
def freeze_graph(saved_model_dir):
    from tensorflow.python.tools import freeze_graph
    from tensorflow.python.saved_model import tag_constants

    output_graph_filename = os.path.join(saved_model_dir, "freezed_model.pb")
    output_node_names = "head/predictions/ExpandDims"
    initializer_nodes = ""

    freeze_graph.freeze_graph(
        input_saved_model_dir=saved_model_dir,
        output_graph=output_graph_filename,
        saved_model_tags=tag_constants.SERVING,
        output_node_names=output_node_names,
        initializer_nodes=initializer_nodes,
        input_graph=None,
        input_saver=False,
        input_binary=False,
        input_checkpoint=None,
        restore_op_name=None,
        filename_tensor_name=None,
        clear_devices=False,
        input_meta_graph=False,
    )

    print("SavedModel graph frozen!")
In [21]:
freeze_graph(saved_model_dir)
In [22]:
%%bash
saved_models_base=models/mnist/cnn_classifier/export/
saved_model_dir=${saved_models_base}$(ls ${saved_models_base} | tail -n 1)
echo ${saved_model_dir}
ls ${saved_model_dir}
In [23]:
def get_graph_def_from_file(graph_filepath):
    print(graph_filepath)
    print("")

    from tensorflow.python.framework import ops

    with ops.Graph().as_default():
        with tf.gfile.GFile(graph_filepath, "rb") as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())
            return graph_def
In [24]:
freezed_filepath=os.path.join(saved_model_dir,'freezed_model.pb')
describe_graph(get_graph_def_from_file(freezed_filepath))
In [25]:
def optimize_graph(model_dir, graph_filename, transforms):
    from tensorflow.tools.graph_transforms import TransformGraph

    input_names = []
    output_names = ['head/predictions/ExpandDims']

    graph_def = get_graph_def_from_file(os.path.join(model_dir, graph_filename))
    optimised_graph_def = TransformGraph(graph_def,
                                         input_names,
                                         output_names,
                                         transforms)
    tf.train.write_graph(optimised_graph_def,
                         logdir=model_dir,
                         as_text=False,
                         name='optimised_model.pb')

    print("Frozen graph optimised!")
In [26]:
transforms = [
    'remove_nodes(op=Identity)',
    'fold_constants(ignore_errors=true)',
    'fold_batch_norms',
    # 'fuse_resize_pad_and_conv',
    # 'quantize_weights',
    # 'quantize_nodes',
    'merge_duplicate_nodes',
    'strip_unused_nodes',
    'sort_by_execution_order'
]
optimize_graph(saved_model_dir, 'freezed_model.pb', transforms)
In [27]:
%%bash
saved_models_base=models/mnist/cnn_classifier/export/
saved_model_dir=${saved_models_base}$(ls ${saved_models_base} | tail -n 1)
echo ${saved_model_dir}
ls ${saved_model_dir}
In [28]:
optimised_filepath=os.path.join(saved_model_dir,'optimised_model.pb')
describe_graph(get_graph_def_from_file(optimised_filepath))
In [29]:
def convert_graph_def_to_saved_model(graph_filepath):

    export_dir = os.path.join(saved_model_dir, 'optimised')
    if tf.gfile.Exists(export_dir):
        tf.gfile.DeleteRecursively(export_dir)

    graph_def = get_graph_def_from_file(graph_filepath)

    with tf.Session(graph=tf.Graph()) as session:
        tf.import_graph_def(graph_def, name="")
        tf.saved_model.simple_save(
            session,
            export_dir,
            inputs={
                node.name: session.graph.get_tensor_by_name("{}:0".format(node.name))
                for node in graph_def.node if node.op == 'Placeholder'},
            outputs={
                "class_ids": session.graph.get_tensor_by_name("head/predictions/ExpandDims:0")
            }
        )

    print("Optimised graph converted to SavedModel!")
In [30]:
optimised_filepath=os.path.join(saved_model_dir,'optimised_model.pb')
convert_graph_def_to_saved_model(optimised_filepath)
In [31]:
optimised_saved_model_dir = os.path.join(saved_model_dir,'optimised')
get_size(optimised_saved_model_dir)
In [32]:
%%bash
saved_models_base=models/mnist/cnn_classifier/export/
saved_model_dir=${saved_models_base}$(ls ${saved_models_base} | tail -n 1)/optimised
ls ${saved_model_dir}
saved_model_cli show --dir ${saved_model_dir} --all
In [33]:
optimised_saved_model_dir = os.path.join(saved_model_dir,'optimised')
print(optimised_saved_model_dir)
inference_test(saved_model_dir=optimised_saved_model_dir, signature='serving_default', input_name='input_image')
In [ ]:
PROJECT = 'ksalama-gcp-playground'
BUCKET = 'ksalama-gcs-cloudml'
REGION = 'europe-west1'
MODEL_NAME = 'mnist_classifier'
os.environ['BUCKET'] = BUCKET
os.environ['PROJECT'] = PROJECT
os.environ['REGION'] = REGION
os.environ['MODEL_NAME'] = MODEL_NAME
In [ ]:
%%bash
gsutil -m rm -r gs://${BUCKET}/tf-model-optimisation
In [ ]:
%%bash
saved_models_base=models/mnist/cnn_classifier/export/
saved_model_dir=${saved_models_base}$(ls ${saved_models_base} | tail -n 1)
echo ${saved_model_dir}
gsutil -m cp -r ${saved_model_dir} gs://${BUCKET}/tf-model-optimisation/original
In [ ]:
%%bash
saved_models_base=models/mnist/cnn_classifier/export/
saved_model_dir=${saved_models_base}$(ls ${saved_models_base} | tail -n 1)/optimised
echo ${saved_model_dir}
gsutil -m cp -r ${saved_model_dir} gs://${BUCKET}/tf-model-optimisation
In [ ]:
%%bash
echo ${MODEL_NAME}
gcloud ml-engine models create ${MODEL_NAME} --regions=${REGION}
Version v_org is the original SavedModel (before optimisation).
In [ ]:
%%bash
MODEL_VERSION='v_org'
MODEL_ORIGIN=gs://${BUCKET}/tf-model-optimisation/original
gcloud ml-engine versions create ${MODEL_VERSION} \
--model=${MODEL_NAME} \
--origin=${MODEL_ORIGIN} \
--runtime-version=1.10
Version v_opt is the optimised SavedModel (after optimisation).
In [ ]:
%%bash
MODEL_VERSION='v_opt'
MODEL_ORIGIN=gs://${BUCKET}/tf-model-optimisation/optimised
gcloud ml-engine versions create ${MODEL_VERSION} \
--model=${MODEL_NAME} \
--origin=${MODEL_ORIGIN} \
--runtime-version=1.10
In [ ]:
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials

credentials = GoogleCredentials.get_application_default()
api = discovery.build(
    'ml', 'v1',
    credentials=credentials,
    discoveryServiceUrl='https://storage.googleapis.com/cloud-ml/discovery/ml_v1_discovery.json'
)

def predict(version, instances):
    request_data = {'instances': instances}
    model_url = 'projects/{}/models/{}/versions/{}'.format(PROJECT, MODEL_NAME, version)
    response = api.projects().predict(body=request_data, name=model_url).execute()

    class_ids = None
    try:
        class_ids = [item["class_ids"] for item in response["predictions"]]
    except Exception:
        print(response)

    return class_ids
In [ ]:
def inference_cmle(version, batch=100, repeat=10):

    instances = [
        {'input_image': [float(i) for i in list(eval_data[img])]}
        for img in range(batch)
    ]

    # warm-up request (a single-instance list), so model loading time is not counted below
    predict(version, [instances[0]])
    print('Warm up request performed!')
    print('Timer started...')
    print('')

    time_start = datetime.utcnow()
    output = None
    for i in range(repeat):
        output = predict(version, instances)
    time_end = datetime.utcnow()

    time_elapsed_sec = (time_end - time_start).total_seconds()

    print("Inference elapsed time: {} seconds".format(time_elapsed_sec))
    print("")
    print("Prediction produced for {} instances batch, repeated {} times".format(len(output), repeat))
    print("Average latency per batch: {} seconds".format(time_elapsed_sec / repeat))
    print("")
    print("Prediction output for the first instance in the batch: {}".format(output[0]))
In [ ]:
version='v_org'
inference_cmle(version)
In [ ]:
version='v_opt'
inference_cmle(version)
In [ ]: